{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Otto商品分类——线性SVM\n",
    "\n",
    "我们以Kaggle 2015年举办的Otto Group Product Classification Challenge竞赛数据为例，分别调用\n",
    "缺省参数LinearSVC、\n",
    "LinearSVC + CV进行参数调优（手动实现循环）。\n",
    "\n",
    "Otto数据集是著名电商Otto提供的一个多类商品分类问题，类别数=9. 每个样本有93维数值型特征（整数，表示某种事件发生的次数，已经进行过脱敏处理）。 竞赛官网：https://www.kaggle.com/c/otto-group-product-classification-challenge/data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 首先 import 必要的模块\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "#竞赛的评价指标为logloss，但LinearSVC不支持概率\n",
    "#所以在这个例子中我们用正确率accuracy_score作为模型选择的度量\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "from matplotlib import pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 读取数据\n",
    "# path to where the data lies\n",
    "dpath = './data/'\n",
    "\n",
    "# 采用原始特征 + tf_idf特征\n",
    "#原始特征 + tf_idf特征对线性SVM训练还是很快，RBF核已慢得不行\n",
    "# RBF核只用tf_idf特征\n",
    "train1 = pd.read_csv(dpath +\"Otto_FE_train_org.csv\")\n",
    "train2 = pd.read_csv(dpath +\"Otto_FE_train_tfidf.csv\")\n",
    "#train = pd.read_csv(dpath +\"Otto_FE_train_tfidf.csv\")\n",
    "\n",
    "#去掉多余的id\n",
    "train2 = train2.drop([\"id\",\"target\"], axis=1)\n",
    "train =  pd.concat([train1, train2], axis = 1, ignore_index=False)\n",
    "train.head()\n",
    "\n",
    "del train1\n",
    "del train2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将类别字符串变成数字\n",
    "# drop ids and get labels\n",
    "y_train = train['target']   #形式为Class_x\n",
    "X_train = train.drop([\"id\", \"target\"], axis=1)\n",
    "\n",
    "#保存特征名字以备后用（可视化）\n",
    "feat_names = X_train.columns \n",
    "\n",
    "#sklearn的学习器大多之一稀疏数据输入，模型训练会快很多\n",
    "from scipy.sparse import csr_matrix\n",
    "X_train = csr_matrix(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 训练样本6w+，交叉验证太慢，用train_test_split估计模型性能\n",
    "# SVM对大样本数据集支持不太好\n",
    "# 从将数据集中随机抽取10000条记录，用于下述作业中模型的训练\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size = 10000,random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 186)\n"
     ]
    }
   ],
   "source": [
    "print (X_train_part.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 默认参数的 SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import LinearSVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
       "          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
       "          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
       "          verbose=0)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#LinearSVC不能得到每类的概率（只有predict函数，没有predict_proba函数），在Otto数据集要求输出每类的概率，这里只是示意SVM的使用方法\n",
    "#https://xacecask2.gitbooks.io/scikit-learn-user-guide-chinese-version/content/sec1.4.html\n",
    "#1.4.1.2. 得分与概率\n",
    "#1. 生成学习器实例\n",
    "SVC1 = LinearSVC()\n",
    "\n",
    "#2. 模型训练\n",
    "SVC1.fit(X_train_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is:  0.7596283588418983\n",
      "Classification report for classifier LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
      "          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
      "          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
      "          verbose=0):\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "     Class_1       0.63      0.30      0.41      1615\n",
      "     Class_2       0.65      0.86      0.74     13479\n",
      "     Class_3       0.51      0.33      0.40      6706\n",
      "     Class_4       0.72      0.14      0.23      2267\n",
      "     Class_5       0.94      0.96      0.95      2312\n",
      "     Class_6       0.93      0.93      0.93     11883\n",
      "     Class_7       0.71      0.61      0.65      2399\n",
      "     Class_8       0.84      0.92      0.88      7077\n",
      "     Class_9       0.79      0.86      0.83      4140\n",
      "\n",
      "    accuracy                           0.76     51878\n",
      "   macro avg       0.75      0.66      0.67     51878\n",
      "weighted avg       0.75      0.76      0.74     51878\n",
      "\n",
      "\n",
      "Confusion matrix:\n",
      "[[  487    64    14     8     8   140    48   382   464]\n",
      " [   15 11626  1435    61    74    43   102    68    55]\n",
      " [    5  4209  2223    26     9    16   155    37    26]\n",
      " [    0  1356   386   308    17   114    62    17     7]\n",
      " [    3    71     4     0  2224     3     0     4     3]\n",
      " [   71   118    24    12     4 11006   166   270   212]\n",
      " [   57   260   193     7    18   171  1462   200    31]\n",
      " [   72    84    35     2     6   207    51  6494   126]\n",
      " [   69    73     6     5    10   158    23   218  3578]]\n"
     ]
    }
   ],
   "source": [
    "#3. 在校验集上测试，估计模型性能\n",
    "y_predict = SVC1.predict(X_val)\n",
    "\n",
    "print(\"accuracy is: \",accuracy_score(y_val, y_predict))\n",
    "\n",
    "print(\"Classification report for classifier %s:\\n%s\\n\"\n",
    "      % (SVC1, classification_report(y_val, y_predict)))\n",
    "print(\"Confusion matrix:\\n%s\" % confusion_matrix(y_val, y_predict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用原始特征 + tfidf特征的线性SVM分类性能：accuracy is： 0.76430187459599219\n",
    "\n",
    "class_1,class_3和class_4分类效果不好。\n",
    "是因为这几类样本数目少？（class_6类的样本数目也不多）。后面采用类别权重试试\n",
    "(用class_weight='balanced'效果更差了)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 线性SVM正则参数调优"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "线性SVM LinearSVC的需要调整正则超参数包括C（正则系数，一般在log域（取log后的值）均匀设置候选参数）和正则函数penalty（L2/L1） \n",
    "\n",
    "采用交叉验证，网格搜索步骤与Logistic回归正则参数处理类似，在此略。\n",
    "\n",
    "这里我们用校验集（X_val、y_val）来估计模型性能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#单组超参数情况，模型在训练集上训练，在校验集上的测试的测试性能\n",
    "def fit_grid_point_Linear(C, X_train, y_train, X_val, y_val):\n",
    "    \n",
    "    # 在训练集上训练SVC\n",
    "    SVC2 =  LinearSVC( C = C)\n",
    "    SVC2 = SVC2.fit(X_train, y_train)\n",
    "    \n",
    "    # 在校验集上返回accuracy\n",
    "    accuracy = SVC2.score(X_val, y_val)\n",
    "    \n",
    "    print(\"C= {} : accuracy= {} \" .format(C, accuracy))\n",
    "    return accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 0.1 : accuracy= 0.750183121939936 \n",
      "C= 1.0 : accuracy= 0.7596283588418983 \n",
      "C= 10.0 : accuracy= 0.7618836501021627 \n",
      "C= 100.0 : accuracy= 0.760187362658545 \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No handles with labels found to put in legend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 1000.0 : accuracy= 0.6656771656578897 \n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEGCAYAAAB/+QKOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deZRU5bnv8e/D2DLI2A4REFA0okFCWpxFnIIawSkKTkzVrtzE3MSVc7xmztGVk+Hee7znJN7cKEoAo4g44UhQUGNEpXFGRJHE0MGBIKAocz/3j3e3XXZXd1c3tWvX8PusVYuqvXdVPbXprl/vd+/3fc3dERERaaxD0gWIiEhhUkCIiEhGCggREclIASEiIhkpIEREJKNOSReQK/379/fBgwcnXYaISFFZvnz5P929MtO6kgmIwYMHU1NTk3QZIiJFxczeaW6dmphERCQjBYSIiGSkgBARkYxK5hyEiEi527lzJ7W1tWzbtq3JuoqKCgYMGEDnzp2zfj0FhIhIiaitraVnz54MHjwYM/tsubuzYcMGamtrGTJkSNavpyYmEZESsW3bNvr16/e5cAAwM/r165fxyKIlCggRkRLSOBxaW94SNTGJ7KG6OvjoI9i8GTZtCrf6+/X/duwI3btDjx7hln6/8eM2NBGLxEoBIWVv+/bMX+otfeGn3//4Y8jltCpdumQfJo0fN7eue3fopN92aaNYf2TMbBzwn0BHYIa7/7LR+huAsdHDbsA+7t47WjcImAEMBBw4y93/Fme9UnzcYcuW7L7Im1vWWrNshw7Qqxf07t3w79Ch4d/0ZY3/rb/fq1dDnVu2wCefNNxv/Lildf/4R9P1dXXZ76uuXfc8aDLd76CG6oLi7hmbk9ozOVxsAWFmHYEbgdOBWmCZmS1w99frt3H3q9O2/zbw5bSXmA383N0XmVkPoA2/ClIsdu5s/1/umzeHW2tfkhUVTb+4Bw9u+Us9/d8ePaAdzbdN9O0bbrniHo5+9jR4Pvyw6bq2fJfstdeeB03jx9265Wafl5uKigo2bNjQ5ER1/VVMFRUVbXq9OI8gRgOr3X0NgJnNBSYArzez/STgp9G2w4FO7r4IwN23xFintJM7fPpp61/qLa379NPW36f+r/D6L/CBA+FLX8ruy71Xr/CXcykyC+FXUQH9++fudd1h69a2B03jxx988Pl12fxf16uuhptuyt1nKhcDBgygtraW9evXN1lX3w+iLeIMiAOAtWmPa4GjM21oZgcCQ4DF0aJDgE1mdk+0/DHgWnff3eh5VwJXAgwaNCinxcvnbdgATz4JixfDU0/BunXhC3737paf17lzw5d3/Rf2AQe0/qVev65nTzVh5JtZ+Au+WzfYZ5/cvW5dXQiJ1oLlwQdh1iz493/PbfCVg86dO7epn0Nr4gyITAeIzR24TgTmpwVAJ+BEQpPT34E7gSnALZ97MfebgJsAqqqqcniaUDZvDkGweDEsWQKvvBL+suzeHU44AU48MbsmmooKNRVI0KFDQxNSS048EUaMgDlz4OqrW95W4hVnQNQSTjDXGwCsa2bbicC3Gj33xbTmqfuAY2gUEJI7W7bA0083BMILL4S/+Coq4Pjj4frrYexYOOooXYYp8frSl+Doo2HGDPjud/UHRpLiDIhlwDAzGwL8gxAClzTeyMwOBfoASxs9t4+ZVbr7euAUQJM95NDWrfDMMyEMFi+GZctg167w5X/MMfDjH4dAOOaY0m3Dl8JVXQ2pFDz7LBx7bNLVlK/YAsLdd5nZVcBCwmWut7r7CjO7Dqhx9wXRppOAuZ52DZa77zazfwEet3Aqfjlwc1y1loPt2+G55xoC4dlnYceO0IHrqKPgmmtCIBx3XGh7FknSxReHo4ebb1ZAJMnac21sIaqqqnLNKNdg506oqWkIhGeeCUcNZjBqVAiDU04J5xN69ky6WpGmqqvh9tvh3Xdh772TrqZ0mdlyd6/KtE59K0vE7t3w4oshEJYsgT//OZxXgHDC78orQyCcdFI4gSxS6Kqrw3mIuXPDz6/kn44gilRdHbz6akMgPPlkuPII4LDDwhHC2LFw8sm6VFCKkzsceWS4UOL555OupnTpCKIEuMMbbzRcZfTEE6FvAsDBB8NFFzUEwv77J1mpSG6YhRPV3/kOvPxyCAvJLwVEgXKHt9/+fCC8915YN2gQnHNOw1HCwIEtvpRI0brssnABxYwZ8JvfJF1N+VFAFJB33mk4qbxkCdTWhuX77w+nntoQCEOG6NpwKQ99+8IFF8Btt8Gvfx3GfZL8UUAkaN26zwfCX/8alldWNoTB2LFwyCEKBClfqVS4munuu8MRheSPAiKPPvggNBXVh8Kbb4blffrAmDFhWIGxY+HwwxUIIvXGjIGDDgrNTAqI/FJAxOjDD8PVRfVXGr32Wljes2e43LT+0tMRI0KHNRFpqkOHcBTx/e+HP6oOOSTpisqHAiKHPvooDHBXHwgvvRRONnfrFjqkXXppCIRRozS7l0hbTJ4MP/oR3HIL/OpXSVdTPtQPYg988kkY4K4+EGpqQv+Erl3DkBX15xBGjw7TSIpI+517LixdGi7e0ICRuaN+EDmydWv4Aa0PhOefD0NadOoUBrX74Q9DIBx7bOjcIyK5U10N998f5os477ykqykPCogW7NjRMMDdkiUhHLZvD22iVVXwve+FQDj++DBPgojE56tfDZNN3XyzAiJfFBBpdu2C5csbLjv9y1/CDFhmMHIkXHVVCIQTT9TgYSL51qkTTJ0KP/85rF2rDqL5UPYBsWlTOPG1eHEY4O7jj8PyI44IV06MHRuuOMrlZPMi0j7Tp4eAmDkTfvKTpKspfWV/knrjxjCY3bBhDUNgjxmT27l4RSR3zjgDVq2CNWt0eXgu6CR1C/r0CWMcVVYmXYmIZCOVChMKPfZYOC8h8emQdAGFQOEgUjwmTIB+/ULPaomXAkJEikrXrqHj3P33h+FrJD4KCBEpOtOnhz5Is2cnXUlpU0CISNEZPjyMVjBjRhjORuKhgBCRolRdHa5m+stfkq6kdCkgRKQoff3rYWTkm29OupLSpYAQkaLUvTtccgncdVfo8Cq5p4AQkaJVXR0G0bzjjqQrKU0KCBEpWqNGhXHS1MwUDwWEiBQts9Cz+sUX4YUXkq6m9CggRKSoXXppmH9FPatzTwEhIkWtd+9wRdMf/xhmeZTcUUCISNFLpcKc8PPnJ11JaVFAiEjRO/FEOOQQNTPlmgJCRIpe/cnqp5+GN95IuprSoYAQkZJwxRVhWlIdReROrAFhZuPMbJWZrTazazOsv8HMXopub5rZpkbr9zazf5jZb+OsU0SK3777wvjxMGsW7NiRdDWlIbaAMLOOwI3AmcBwYJKZDU/fxt2vdveR7j4S+A1wT6OXuR54Mq4aRaS0VFfDP/8JCxYkXUlpiPMIYjSw2t3XuPsOYC4woYXtJwGfdZg3s68A+wJ/irFGESkhp58OAweqZ3WuxBkQBwBr0x7XRsuaMLMDgSHA4uhxB+B/A//a0huY2ZVmVmNmNevXr89J0SJSvDp2hGnTYNEi+Nvfkq6m+MUZEJZhWXNTe0wE5rv77ujxN4GH3X1tM9uHF3O/yd2r3L2qUhNLiwghIABmzky2jlIQZ0DUAgPTHg8A1jWz7UTSmpeAY4GrzOxvwP8CrjCzX8ZRpIiUlkGD4KtfhVtvhd27W99emhdnQCwDhpnZEDPrQgiBJqeOzOxQoA+wtH6Zu1/q7oPcfTDwL8Bsd29yFZSISCapFNTWwsKFSVdS3GILCHffBVwFLARWAvPcfYWZXWdm49M2nQTMddfMsiKSG+ecA/vsoz4Re8pK5Xu5qqrKa2pqki5DRArENdfADTfA2rWw335JV1O4zGy5u1dlWqee1CJSkqZPh127Qsc5aR8FhIiUpEMPDYP4zZgBJdJQkncKCBEpWdXVsHo1PKnxGNpFASEiJeuCC6BXL52sbi8FhIiUrG7dwpSk8+fDxo1JV1N8FBAiUtKqq2H79jAlqbSNAkJEStrIkfCVr4QB/HSyum0UECJS8lIpeOUVUFeptlFAiEjJu+SScD5CJ6vbRgEhIiVv773hoovg9tthy5akqykeCggRKQupVAiHefOSrqR4KCBEpCwcdxwcdpiamdpCASEiZcEsHEUsXQorViRdTXFQQIhI2bj8cujcWUcR2VJAiEjZqKyEc8+F2bND5zlpmQJCRMpKdTV8+CHce2/SlRQ+BYSIlJVTT4UDD1QzUzYUECJSVjp0CJMJPf44rFmTdDWFTQEhImVn6tQQFLfcknQlhU0BISJlZ8AAOPNMmDkzTEsqmSkgRKQspVLw7rvwyCNJV1K4FBAiUpbOPhv2208nq1uigBCRstS5M0yZAg89BOvWJV1NYVJAiEjZmj4ddu+GP/wh6UoKkwJCRMrWwQfD2LHhaqa6uqSrKTwKCBEpa6lU6A+xZEnSlRQeBYSIlLXzz4c+fXSyOhMFhIiUtYqKMMrrPffAhg1JV1NYsgoIM7vbzM42MwWKiJSc6dNhxw6YMyfpSgpLtl/4vwMuAd4ys1+a2RdjrElEJK9GjIDRo0Mzk3vS1RSOrALC3R9z90uBUcDfgEVm9oyZTTWzznEWKCKSD6lUmGnuueeSrqRwZN1kZGb9gClACngR+E9CYCyKpTIRkTyaOBG6d4ebb066ksKR7TmIe4A/A92Ac9x9vLvf6e7fBnq08LxxZrbKzFab2bUZ1t9gZi9FtzfNbFO0fKSZLTWzFWb2ipld3L6PJyKSnZ49Q0jMnQsffZR0NYUh2yOI37r7cHf/hbu/m77C3asyPcHMOgI3AmcCw4FJZja80XOvdveR7j4S+A1wT7TqU+AKdz8cGAf8HzPrnfWnEhFph1QKPv0U7rwz6UoKQ7YBcVj6F7SZ9TGzb7bynNHAandf4+47gLnAhBa2nwTcAeDub7r7W9H9dcAHQGWWtYqItMvRR8MRR6iZqV62AVHt7pvqH7j7RqC6leccAKxNe1wbLWvCzA4EhgCLM6wbDXQB3s6w7kozqzGzmvXr17f6IUREWmIWjiKWLYOXX066muRlGxAdzMzqH0TNR11aeY5lWNbcBWQTgfnuvvtzL2C2PzAHmOruTUZKcfeb3L3K3asqK3WAISJ77rLLoEsXzTYH2QfEQmCemZ1qZqcQmoIebeU5tcDAtMcDgOYG1Z0YveZnzGxv4CHgR+7+bJZ1iojskX794IILQqe5rVuTriZZ2QbE/yA0//w34FvA48A1rTxnGTDMzIaYWRdCCCxovJGZHQr0AZamLesC3AvMdve7sqxRRCQnUinYtCkMv1HOsu0oV+fuv3P3C939Anf/fePmoAzP2QVcRTj6WAnMc/cVZnadmY1P23QSMNf9c/0XLwJOAqakXQY7sk2fTESknU4+GYYO1QB+5ln0KzezYcAvCJerVtQvd/eh8ZXWNlVVVV5TU5N0GSJSIn7xC/jBD+DNN2HYsKSriY+ZLW+uu0K2TUwzCeMx7QLGArMJJ49FRErS5MnQsWN5n6zONiD2cvfHCUcc77j7z4BT4itLRCRZX/gCnH12mI50586kq0lGtgGxLRrq+y0zu8rMzgP2ibEuEZHEpVLw/vvw0ENJV5KMbAPiu4RxmP478BXgMmByXEWJiBSCM88MRxLl2rO61YCIOsVd5O5b3L3W3adGVzKpb4KIlLROnWDqVHj0UVi7tvXtS02rARFdzvqV9J7UIiLlYto0qKsL5yLKTbZNTC8C95vZ5WZ2fv0tzsJERArB0KFw2mnhaqa6JgP+lLZsA6IvsIFw5dI50e1rcRUlIlJIUil45x147LGkK8mvTtls5O5T4y5ERKRQnXtuGKNpxgw444ykq8mfrALCzGaSYSRWd5+W84pERApM165wxRXw29/C+vVQLoNHZ9vE9CBhZNWHCAP17Q1siasoEZFCM3166DA3e3bSleRPVmMxNXlS6DT3mLsXTG9qjcUkInE77jjYuBFefz1MLlQKcjEWU2PDgEHtL0lEpPhUV8Mbb8AzzyRdSX5kFRBm9rGZfVR/Ax4gzBEhIlI2vv516NmzfHpWZzsfRE933zvtdoi73x13cSIihaRHD5g0CebNg82bk64mftkeQZxnZr3SHvc2s3PjK0tEpDClUmEq0jvuaH3bYpftOYifuvtneenum4CfxlOSiEjhqqqCI48sj2ambAMi03ZZ9aEQESklZuEo4oUXwq2UZRsQNWb2H2Z2kJkNNbMbgOVxFiYiUqguvRQqKkp/trlsA+LbwA7gTmAesBX4VlxFiYgUsj594MIL4Y9/hE8/Tbqa+GR7FdMn7n6tu1dFtx+4+ydxFyciUqhSqXAl0/z5SVcSn2yvYlpkZr3THvcxs4XxlSUiUthOOgmGDQsD+JWqbJuY+kdXLgHg7hvRnNQiUsbqT1b/+c+hd3UpyjYg6szss6E1zGwwGUZ3FREpJ1dcEaYlLdWT1dkGxA+Bp81sjpnNAZ4Evh9fWSIihW+//eCcc2DWLNixI+lqci/bk9SPAlXAKsKVTN8jXMkkIlLWqqvDHBEPPJB0JbmX7UnqFGEeiO9FtznAz+IrS0SkOJxxBgwYUJo9q7NtYvoOcBTwjruPBb4MrI+tKhGRItGxI0ybBn/6U5i3upRkGxDb3H0bgJl1dfc3gEPjK0tEpHhMiyZfnjkz2TpyLduAqI36QdwHLDKz+4F18ZUlIlI8DjwwNDXdeivs3p10NbmT7Unq89x9k7v/DPgxcAug4b5FRCKpFKxdG5qaSkWbpxx19yfdfYG7l+BFXSIi7TN+PFRWllbP6vbOSZ0VMxtnZqvMbLWZXZth/Q1m9lJ0e9PMNqWtm2xmb0W3yXHWKSKyp7p0gcmTYcECeP/9pKvJjdgCwsw6AjcCZwLDgUlmNjx9G3e/2t1HuvtI4DfAPdFz+xImJDoaGA381Mz6xFWriEguTJ8Ou3aFjnOlIM4jiNHAandfEzVHzQUmtLD9JKB+Er+vAovc/cNo3KdFwLgYaxUR2WNf/CKccEJoZvISGIwozoA4AFib9rg2WtaEmR0IDAEWt+W5ZnalmdWYWc369eqWISLJq66Gt96Cp55KupI9F2dAWIZlzWXqRGC+u9dfIJbVc939pvo5KiorK9tZpohI7lx4Iey9d2mcrI4zIGqBgWmPB9B834mJNDQvtfW5IiIFo1u3MCXp/PmwcWPS1eyZOANiGTDMzIaYWRdCCCxovJGZHQr0AZamLV4InBFNTNQHOCNaJiJS8KqrYdu2MCVpMYstINx9F3AV4Yt9JTDP3VeY2XVmNj5t00nAXPeGUzru/iFwPSFklgHXRctERArel78Mo0aFAfyK+WS1eTFXn6aqqspramqSLkNEBIDf/Q6++U1YtgyqqpKupnlmttzdM1YYa0c5EZFydcklsNdexX2yWgEhIhKDXr3goovg9tthy5akq2kfBYSISExSKfj4Y7jrrqQraR8FhIhITI4/PvSuLtZmJgWEiEhMzMJRxDPPwIoVSVfTdgoIEZEYXX45dO4Mt9ySdCVtp4AQEYnRPvvAhAkwezZs3550NW2jgBARiVl1NWzYAPfdl3QlbaOAEBGJ2WmnhXmri+1ktQJCRCRmHTrAtGnw2GPw178mXU32FBAiInkwdWoIimI6Wa2AEBHJg4EDYdw4mDkzTEtaDBQQIiJ5kkrBunXw6KNJV5IdBYSISJ587Wuw775hGPBioIAQEcmTzp1hyhR46KFwJFHoFBAiInk0fTrs3g2zZiVdSesUECIieTRsGIwZE65mqqtLupqWKSBERPKsuhrefhueeCLpSlqmgBARybPzz4fevQu/Z7UCQkQkz/baCy67DO6+O4zRVKgUECIiCaiuhh074Lbbkq6keQoIEZEEjBgBRx0Vmpnck64mMwWEiEhCUil47TV4/vmkK8lMASEikpBJk6B798LtWa2AEBFJSM+ecPHFMHcufPxx0tU0pYAQEUlQKgWffAJ33pl0JU0pIEREEnTMMXD44YXZzKSAEBFJkFk4inj+eXjllaSr+TwFhIhIwi67DLp0KbzZ5hQQIiIJ698fzjsP5syBbduSrqaBAkJEpABUV8PGjXDPPUlX0kABISJSAMaOhSFDCmsAv1gDwszGmdkqM1ttZtc2s81FZva6ma0ws9vTlv86WrbSzP7LzCzOWkVEktShQ5hMaMkSWL066WqC2ALCzDoCNwJnAsOBSWY2vNE2w4DvA8e7++HAd6PlxwHHAyOAI4CjgDFx1SoiUgimTg1BUSgnq+M8ghgNrHb3Ne6+A5gLTGi0TTVwo7tvBHD3D6LlDlQAXYCuQGfg/RhrFRFJ3Be+AGefDX/4A+zcmXQ18QbEAcDatMe10bJ0hwCHmNlfzOxZMxsH4O5LgSXAu9FtobuvbPwGZnalmdWYWc369etj+RAiIvmUSsF778HDDyddSbwBkemcQeNBbTsBw4CTgUnADDPrbWYHA4cBAwihcoqZndTkxdxvcvcqd6+qrKzMafEiIkk46yzYf//C6FkdZ0DUAgPTHg8A1mXY5n533+nufwVWEQLjPOBZd9/i7luAR4BjYqxVRKQgdOoUzkU88gjU1iZbS5wBsQwYZmZDzKwLMBFY0Gib+4CxAGbWn9DktAb4OzDGzDqZWWfCCeomTUwiIqVo2jSoqwvnIpIUW0C4+y7gKmAh4ct9nruvMLPrzGx8tNlCYIOZvU445/Cv7r4BmA+8DbwKvAy87O4PxFWriEghOeggOPXUcDVTXV1ydZgX6lx3bVRVVeU1NTVJlyEikhNz54YJhf70Jzj99Pjex8yWu3tVpnXqSS0iUoDOPRf69k22Z7UCQkSkAFVUwOWXw733QlJX8SsgREQKVCoVOszNmZPM+ysgREQK1BFHhBnnZsyAJE4XKyBERApYKgUrV8LSpfl/bwWEiEgBu/hi6NEjmZ7VCggRkQLWo0e43HXePNi8Ob/vrYAQESlwqRR8+mnoG5FPCggRkQJ31FEwYkT+m5kUECIiBc4sHEUsXw4vvpi/91VAiIgUgUsvha5d8zvbnAJCRKQI9O0LF14It90WzkfkgwJCRKRIpFLhSqa7787P+ykgRESKxJgxcPDB+RvATwEhIlIkzGD6dHjqKVi1Kv73U0CIiBSRKVOgY8f8nKxWQIiIFJH99oNzzoFZs2DHjnjfSwEhIlJkUin44AN4IOaJmBUQIiJFZtw4OOCA+E9WKyBERIpMx44wbRosXAh//3t876OAEBEpQtOmhX9nzozvPRQQIiJFaPBgOP30cDXT7t3xvIcCQkSkSKVSsHYtLFoUz+srIEREitT48dC/f3wnqzvF87IiIhK3rl3h6qvD4H3uoad1LikgRESK2A9+EN9rq4lJREQyUkCIiEhGCggREclIASEiIhkpIEREJCMFhIiIZKSAEBGRjBQQIiKSkbl70jXkhJmtB97Zg5foD/wzR+XkkupqG9XVNqqrbUqxrgPdvTLTipIJiD1lZjXuXpV0HY2prrZRXW2jutqm3OpSE5OIiGSkgBARkYwUEA1uSrqAZqiutlFdbaO62qas6tI5CBERyUhHECIikpECQkREMirbgDCzr5vZCjOrM7NmLw8zs3FmtsrMVpvZtXmoq6+ZLTKzt6J/+zSz3W4zeym6LYixnhY/v5l1NbM7o/XPmdnguGppQ01TzGx92v5JxV1T9L63mtkHZvZaM+vNzP4rqvsVMxtVIHWdbGab0/bXT/JU10AzW2JmK6Pfxe9k2Cbv+yzLuvK+z8yswsyeN7OXo7r+LcM2uf19dPeyvAGHAYcCTwBVzWzTEXgbGAp0AV4Ghsdc16+Ba6P71wK/ama7LXnYR61+fuCbwP+L7k8E7iyAmqYAv03gZ+okYBTwWjPrzwIeAQw4BniuQOo6GXgwgf21PzAqut8TeDPD/2Xe91mWdeV9n0X7oEd0vzPwHHBMo21y+vtYtkcQ7r7S3Ve1stloYLW7r3H3HcBcYELMpU0AZkX3ZwHnxvx+Lcnm86fXOx841SzXM+O2uaZEuPtTwIctbDIBmO3Bs0BvM9u/AOpKhLu/6+4vRPc/BlYCBzTaLO/7LMu68i7aB1uih52jW+OrjHL6+1i2AZGlA4C1aY9rif8HZV93fxfCDyqwTzPbVZhZjZk9a2ZxhUg2n/+zbdx9F7AZ6BdTPdnWBHBB1CQx38wGxlhPWyTx85StY6Omi0fM7PB8v3nUFPJlwl/F6RLdZy3UBQnsMzPraGYvAR8Ai9y92f2Vi9/HTu19YjEws8eA/TKs+qG735/NS2RYtsfXBbdUVxteZpC7rzOzocBiM3vV3d/e09oayebzx7KPWpDN+z0A3OHu283sG4S/qE6JsaZs5XtfZesFwng8W8zsLOA+YFi+3tzMegB3A991948ar87wlLzss1bqSmSfuftuYKSZ9QbuNbMj3D393FJO91dJB4S7n7aHL1ELpP/1OQBYt4ev2WJdZva+me3v7u9Gh9IfNPMa66J/15jZE4S/cnIdENl8/vptas2sE9CLeJszWq3J3TekPbwZ+FWM9bRFLD9Peyr9y8/dHzaz/2tm/d099kHpzKwz4Uv4j+5+T4ZNEtlnrdWV5D6L3nNT9Hs/DkgPiJz+PqqJqWXLgGFmNsTMuhBO+sR2xVBkATA5uj8ZaHKkY2Z9zKxrdL8/cDzwegy1ZPP50+u9EFjs0RmymLRaU6M26vGENuRCsAC4Iroy5xhgc31zYpLMbL/6dmozG034XtjQ8rNy8r4G3AKsdPf/aGazvO+zbOpKYp+ZWWV05ICZ7QWcBrzRaLPc/j7m8yx8Id2A8whpux14H1gYLf8C8HDadmcRrmJ4m9A0FXdd/YDHgbeif/tGy6uAGdH944BXCVfwvApMj7GeJp8fuA4YH92vAO4CVgPPA0PzsI9aq+kXwIpo/ywBvpinn6k7gHeBndHP1nTgG8A3ovUG3BjV/SrNXD2XQF1Xpe2vZ4Hj8lTXCYTmj1eAl6LbWUnvsyzryvs+A0YAL0Z1vQb8JFoe2++jhtoQEZGM1MQkIiIZKSBERCQjBYSIiGSkgBARkYwUECIikpECQqQNzGxL61u1+Pz5Ue93zKyHmf3ezN6ORud8yptStVwAAAHgSURBVMyONrMu0f2S7sgqhU8BIZIn0Xg9Hd19TbRoBqGX6zB3P5wwCm1/D4MQPg5cnEihIhEFhEg7RD17/6eZvWZmr5rZxdHyDtGwCyvM7EEze9jMLoyedilRz3gzOwg4GviRu9dBGDbF3R+Ktr0v2l4kMTqEFWmf84GRwJFAf2CZmT1FGPZkMPAlwki8K4Fbo+ccT+jVDHA48JKHwdcyeQ04KpbKRbKkIwiR9jmBMGLsbnd/H3iS8IV+AnCXu9e5+3uEoT7q7Q+sz+bFo+DYYWY9c1y3SNYUECLt09wkLC1NzrKVMFYOhHF8jjSzln4HuwLb2lGbSE4oIETa5yng4mgCl0rCtJ7PA08TJivqYGb7EqamrLcSOBjAw9wdNcC/pY0KOszMJkT3+wHr3X1nvj6QSGMKCJH2uZcwqubLwGLgmqhJ6W7CiKmvAb8nzES2OXrOQ3w+MFKEiaNWm9mrhLkr6uc6GAs8HO9HEGmZRnMVyTEz6+FhprF+hKOK4939vWgM/yXR4+ZOTte/xj3A9731edNFYqOrmERy78FoYpcuwPXRkQXuvtXMfkqYN/jvzT05mgjpPoWDJE1HECIikpHOQYiISEYKCBERyUgBISIiGSkgREQkIwWEiIhk9P8BmgCWViCWzWsAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "#需要调优的参数\n",
    "#SVM太慢，每次只调一个参数（这里只调C，penalty为‘l2'）\n",
    "C_s = np.logspace(-1, 3, 5)# logspace(a,b,N)把10的a次方到10的b次方区间分成N份  \n",
    "#penalty_s = ['l1','l2']\n",
    "\n",
    "accuracy_s = []\n",
    "for i, oneC in enumerate(C_s):\n",
    "#    for j, penalty in enumerate(penalty_s):\n",
    "    tmp = fit_grid_point_Linear(oneC, X_train_part, y_train_part, X_val, y_val)\n",
    "    accuracy_s.append(tmp)\n",
    "\n",
    "x_axis = np.log10(C_s)\n",
    "#for j, penalty in enumerate(penalty_s):\n",
    "plt.plot(x_axis, np.array(accuracy_s), 'b-')\n",
    "    \n",
    "plt.legend()\n",
    "plt.xlabel( 'log(C)' )                                                                                                      \n",
    "plt.ylabel( 'accuracy' )\n",
    "#plt.savefig('SVM_Otto.png' )\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.0\n"
     ]
    }
   ],
   "source": [
    "### 最佳超参数\n",
    "index = np.argmax(accuracy_s, axis=None)\n",
    "Best_C = C_s[ index ]\n",
    "\n",
    "print(Best_C)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 找到最佳参数后，用全体训练数据训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SVC训练SVC，支持概率输出\n",
    "Best_C = 100\n",
    "\n",
    "SVC3 = LinearSVC(C = Best_C)\n",
    "SVC3.fit(X_train, y_train)\n",
    "\n",
    "#保持模型，用于后续测试\n",
    "import pickle\n",
    "pickle.dump(SVC3, open(\"Otto_LinearSVC.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
